Importing¶

In [ ]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn.metrics import root_mean_squared_error as rmse

from tqdm.auto import tqdm

import random

import salishsea_tools.viz_tools as sa_vi

Datasets Preparation¶

In [ ]:
def datasets_preparation(dataset):
    
    drivers = np.stack([np.ravel(dataset['Temperature_(0m-15m)']),
        np.ravel(dataset['Temperature_(15m-100m)']), np.ravel(dataset['Salinity_(0m-15m)']),
        np.ravel(dataset['Salinity_(15m-100m)']), np.ravel(dataset['Nitrate_(0m-15m)']), 
        np.ravel(dataset['Nitrate_(15m-100m)']),
        np.tile(dataset.x, len(dataset.y)),
        np.tile(dataset.y, len(dataset.x))])
    
    indx = np.where(~np.isnan(drivers).any(axis=0))
    drivers = drivers[:,indx[0]]

    diat = np.ravel(dataset['Diatom_Production_Rate'])
    diat = diat[indx[0]]

    return(drivers, diat, indx)

Regressor¶

In [ ]:
def regressor (inputs, targets):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs = scale.fit_transform(inputs)
    X_train, _, y_train, _ = train_test_split(inputs, targets, train_size=0.35)

    drivers = None
    diat = None
    
    inputs = None
    targets = None

    model = make_pipeline(scale, GradientBoostingRegressor(n_estimators=200))
    regr = BaggingRegressor(model, n_estimators=12, n_jobs=4).fit(X_train, y_train) 

    return (regr)

Regressor 2¶

In [ ]:
def regressor2 (inputs, targets, variable_name):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs_test = regr.predict(inputs2)
   
    m = scatter_plot(targets, outputs_test, variable_name) 
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor 3¶

In [ ]:
def regressor3 (inputs, targets):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs_test = regr.predict(inputs2)
   
    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs_test, deg=1)
    
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor 4¶

In [ ]:
def regressor4 (inputs, targets, variable_name):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs = regr.predict(inputs2)

    # Post processing
    indx2 = np.full((len(diat_i.y)*len(diat_i.x)),np.nan)
    indx2[indx[0]] = outputs
    model = np.reshape(indx2,(len(diat_i.y),len(diat_i.x)))

    m = scatter_plot(targets, outputs, variable_name + str(dates[i].date())) 

    # Preparation of the dataarray 
    model = xr.DataArray(model,
        coords = {'y': diat_i.y, 'x': diat_i.x},
        dims = ['y','x'],
        attrs=dict( long_name = variable_name + "Concentration",
        units="mmol m-2"),)
                        
    plotting3(targets, model, diat_i, variable_name)

Printing¶

In [ ]:
def printing (targets, outputs, m):

    print ('The amount of data points is', outputs.size)
    print ('The slope of the best fitting line is ', np.round(m,3))
    print ('The correlation coefficient is:', np.round(np.corrcoef(targets, outputs)[0][1],3))
    print (' The root mean square error is:', rmse(targets,outputs))

Scatter Plot¶

In [ ]:
def scatter_plot(targets, outputs, variable_name):

    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs, deg=1)

    printing(targets, outputs, m)

    fig, ax = plt.subplots(2, figsize=(5,10), layout='constrained')

    ax[0].scatter(targets,outputs, alpha = 0.2, s = 10)

    lims = [np.min([ax[0].get_xlim(), ax[0].get_ylim()]),
        np.max([ax[0].get_xlim(), ax[0].get_ylim()])]

    # plot fitted y = m*x + b
    ax[0].axline(xy1=(0, b), slope=m, color='r')

    ax[0].set_xlabel('targets')
    ax[0].set_ylabel('outputs')
    ax[0].set_xlim(lims)
    ax[0].set_ylim(lims)
    ax[0].set_aspect('equal')

    ax[0].plot(lims, lims,linestyle = '--',color = 'k')

    h = ax[1].hist2d(targets,outputs, bins=100, cmap='jet', 
        range=[lims,lims], cmin=0.1, norm='log')
    
    ax[1].plot(lims, lims,linestyle = '--',color = 'k')

    # plot fitted y = m*x + b
    ax[1].axline(xy1=(0, b), slope=m, color='r')

    ax[1].set_xlabel('targets')
    ax[1].set_ylabel('outputs')
    ax[1].set_aspect('equal')

    fig.colorbar(h[3],ax=ax[1], location='bottom')

    fig.suptitle(variable_name)

    plt.show()

    return (m)

Plotting¶

In [ ]:
def plotting(variable, name):

    plt.plot(years,variable, marker = '.', linestyle = '')
    plt.xlabel('Years')
    plt.ylabel(name)
    plt.show()

Plotting 2¶

In [ ]:
def plotting2(variable,title):
    
    fig, ax = plt.subplots()

    scatter= ax.scatter(dates,variable, marker='.', c=pd.DatetimeIndex(dates).month)

    ax.legend(handles=scatter.legend_elements()[0], labels=['February','March','April'])
    fig.suptitle('Daily ' + title + ' (15 Feb - 30 Apr)')
    
    fig.show()

Plotting 3¶

In [ ]:
def plotting3(targets, model, variable, variable_name):

    fig, ax = plt.subplots(2,2, figsize = (10,15))

    cmap = plt.get_cmap('cubehelix')
    cmap.set_bad('gray')

    variable.plot(ax=ax[0,0], cmap=cmap, vmin = targets.min(), vmax =targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    model.plot(ax=ax[0,1], cmap=cmap, vmin = targets.min(), vmax = targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    ((variable-model) / variable * 100).plot(ax=ax[1,0], cmap=cmap, cbar_kwargs={'label': variable_name + ' Concentration  [percentage]'})

    plt.subplots_adjust(left=0.1,
        bottom=0.1, 
        right=0.95, 
        top=0.95, 
        wspace=0.35, 
        hspace=0.35)

    sa_vi.set_aspect(ax[0,0])
    sa_vi.set_aspect(ax[0,1])
    sa_vi.set_aspect(ax[1,0])


    ax[0,0].title.set_text(variable_name + ' (targets)')
    ax[0,1].title.set_text(variable_name + ' (outputs)')
    ax[1,0].title.set_text('targets - outputs')
    ax[1,1].axis('off')

    fig.suptitle(str(dates[i].date()))

    plt.show()
    

Training (Random Points)¶

In [ ]:
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_model_var_old.nc')

ds = ds.isel(time_counter = (np.arange(0, len(ds.Diatom.time_counter),2)), 
    y=(np.arange(ds.y[0], ds.y[-1], 5)), 
    x=(np.arange(ds.x[0], ds.x[-1], 5)))

dates = pd.DatetimeIndex(ds['time_counter'].values)

drivers, diat, _ = datasets_preparation(ds)

regr = regressor(drivers, diat)

Other Years (Anually)¶

In [ ]:
years = range (2007,2024)

r_all = []
rms_all = []
slope_all = []

for year in tqdm(range (2007,2024)):
    
    dataset = ds.sel(time_counter=str(year))
    
    drivers, diat, _ = datasets_preparation(dataset)

    r, rms, m = regressor2(drivers, diat, 'Diatom ' + str(year))
    
    r_all.append(r)
    rms_all.append(rms)
    slope_all.append(m)
    
plotting(np.transpose(r_all), 'Correlation Coefficient')
plotting(np.transpose(rms_all), 'Root Mean Square Error')
plotting (np.transpose(slope_all), 'Slope of the best fitting line')
  0%|          | 0/17 [00:00<?, ?it/s]
The amount of data points is 70794
The slope of the best fitting line is  0.514
The correlation coefficient is: 0.804
 The root mean square error is: 9.860691862466333e-07
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.567
The correlation coefficient is: 0.722
 The root mean square error is: 1.0082039152321887e-06
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.495
The correlation coefficient is: 0.745
 The root mean square error is: 1.0484691844255212e-06
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.513
The correlation coefficient is: 0.724
 The root mean square error is: 1.0535132154323285e-06
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.495
The correlation coefficient is: 0.743
 The root mean square error is: 1.1026195744059351e-06
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.495
The correlation coefficient is: 0.748
 The root mean square error is: 1.0571987752736729e-06
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.535
The correlation coefficient is: 0.728
 The root mean square error is: 1.1088305128109784e-06
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.564
The correlation coefficient is: 0.729
 The root mean square error is: 1.1016923502002949e-06
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.493
The correlation coefficient is: 0.585
 The root mean square error is: 1.3206460370519641e-06
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.581
The correlation coefficient is: 0.756
 The root mean square error is: 1.0844969210169734e-06
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.573
The correlation coefficient is: 0.726
 The root mean square error is: 9.1344151557344e-07
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.658
The correlation coefficient is: 0.719
 The root mean square error is: 9.721463985278978e-07
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.648
The correlation coefficient is: 0.747
 The root mean square error is: 9.931052222113597e-07
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.481
The correlation coefficient is: 0.678
 The root mean square error is: 1.0948012521742677e-06
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.591
The correlation coefficient is: 0.811
 The root mean square error is: 8.958361029626088e-07
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.555
The correlation coefficient is: 0.724
 The root mean square error is: 9.742071105877143e-07
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.524
The correlation coefficient is: 0.718
 The root mean square error is: 1.0509766413340182e-06
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Other Years (Daily)¶

In [ ]:
r_all2 = np.array([])
rms_all2 = np.array([])
slope_all2 = np.array([])

for i in tqdm(range (0, len(ds.time_counter))):
    
    dataset = ds.isel(time_counter=i)

    drivers, diat, _ = datasets_preparation(dataset)

    r, rms, m = regressor3(drivers, diat)

    r_all2 = np.append(r_all2,r)
    rms_all2 = np.append(rms_all2,rms)
    slope_all2 = np.append(slope_all2,m)

plotting2(r_all2, 'Correlation Coefficients')
plotting2(rms_all2, 'Root Mean Square Errors')
plotting2(slope_all2, 'Slope of the best fitting line')
  0%|          | 0/640 [00:00<?, ?it/s]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Daily Maps¶

In [ ]:
maps = random.sample(range(0,len(ds.time_counter)),10)

for i in tqdm(maps):

    dataset = ds.isel(time_counter=i)
    drivers, diat, indx = datasets_preparation(dataset)

    diat_i = dataset['Diatom_Production_Rate']

    regressor4(drivers, diat, 'Diatom Production Rate ')
  0%|          | 0/10 [00:00<?, ?it/s]
The amount of data points is 1863
The slope of the best fitting line is  0.701
The correlation coefficient is: 0.633
 The root mean square error is: 1.796589326299975e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.65
The correlation coefficient is: 0.273
 The root mean square error is: 2.376407453188609e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.736
The correlation coefficient is: 0.725
 The root mean square error is: 1.0378832013691575e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.596
The correlation coefficient is: 0.711
 The root mean square error is: 2.1151240662632372e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.575
The correlation coefficient is: 0.585
 The root mean square error is: 9.606572531131704e-07
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.853
The correlation coefficient is: 0.711
 The root mean square error is: 1.1732026484837925e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.787
The correlation coefficient is: 0.65
 The root mean square error is: 1.251610783228305e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  -0.13
The correlation coefficient is: -0.101
 The root mean square error is: 2.39109543919693e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.557
The correlation coefficient is: 0.318
 The root mean square error is: 1.6548887349847792e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  1.193
The correlation coefficient is: 0.588
 The root mean square error is: 2.141834942613649e-06
No description has been provided for this image
No description has been provided for this image
In [ ]: